import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.colors as mcolors
from dateutil import parser
# Import Scikit-learn for Machine Learning libraries
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.manifold import TSNE
Reading data from CSV file
df = pd.read_csv("data.csv")
df.head()
| | Unnamed: 0 | ID | Country of Origin | Farm Name | Lot Number | Mill | ICO Number | Company | Altitude | Region | ... | Total Cup Points | Moisture Percentage | Category One Defects | Quakers | Color | Category Two Defects | Expiration | Certification Body | Certification Address | Certification Contact |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | Colombia | Finca El Paraiso | CQU2022015 | Finca El Paraiso | NaN | Coffee Quality Union | 1700-1930 | Piendamo,Cauca | ... | 89.33 | 11.8 | 0 | 0 | green | 3 | September 21st, 2023 | Japan Coffee Exchange | 〒413-0002 静岡県熱海市伊豆山1173−58 1173-58 Izusan, Ata... | 松澤 宏樹 Koju Matsuzawa - +81(0)9085642901 |
1 | 1 | 1 | Taiwan | Royal Bean Geisha Estate | The 2022 Pacific Rim Coffee Summit,T037 | Royal Bean Geisha Estate | NaN | Taiwan Coffee Laboratory | 1200 | Chiayi | ... | 87.58 | 10.5 | 0 | 0 | blue-green | 0 | November 15th, 2023 | Taiwan Coffee Laboratory 台灣咖啡研究室 | QAHWAH CO., LTD 4F, No. 225, Sec. 3, Beixin Rd... | Lin, Jen-An Neil 林仁安 - 886-289116612 |
2 | 2 | 2 | Laos | OKLAO coffee farms | The 2022 Pacific Rim Coffee Summit,LA01 | oklao coffee processing plant | NaN | Taiwan Coffee Laboratory | 1300 | Laos Borofen Plateau | ... | 87.42 | 10.4 | 0 | 0 | yellowish | 2 | November 15th, 2023 | Taiwan Coffee Laboratory 台灣咖啡研究室 | QAHWAH CO., LTD 4F, No. 225, Sec. 3, Beixin Rd... | Lin, Jen-An Neil 林仁安 - 886-289116612 |
3 | 3 | 3 | Costa Rica | La Cumbre | CQU2022017 | La Montana Tarrazu MIll | NaN | Coffee Quality Union | 1900 | Los Santos,Tarrazu | ... | 87.17 | 11.8 | 0 | 0 | green | 0 | September 21st, 2023 | Japan Coffee Exchange | 〒413-0002 静岡県熱海市伊豆山1173−58 1173-58 Izusan, Ata... | 松澤 宏樹 Koju Matsuzawa - +81(0)9085642901 |
4 | 4 | 4 | Colombia | Finca Santuario | CQU2023002 | Finca Santuario | NaN | Coffee Quality Union | 1850-2100 | Popayan,Cauca | ... | 87.08 | 11.6 | 0 | 2 | yellow-green | 2 | March 5th, 2024 | Japan Coffee Exchange | 〒413-0002 静岡県熱海市伊豆山1173−58 1173-58 Izusan, Ata... | 松澤 宏樹 Koju Matsuzawa - +81(0)9085642901 |
5 rows × 41 columns
The code checks for duplicate rows in the DataFrame df: it selects them with duplicated() and prints the shape of the result, whose first element is the number of duplicate rows found.
# Handle duplicates
duplicate_rows_data = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_data.shape)
number of duplicate rows: (0, 41)
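No duplicates were found here, but if there had been any, they could be removed in one line (shown only as a sketch; not needed for this dataset):
# Drop exact duplicate rows, keeping the first occurrence, and reset the index
df = df.drop_duplicates().reset_index(drop=True)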
The code loops through each column of df, counts its distinct values, and prints the column name together with the count.
# Loop through each column and count the number of distinct values
for column in df.columns:
    num_distinct_values = len(df[column].unique())
    print(f"{column}: {num_distinct_values} distinct values")
Unnamed: 0: 207 distinct values
ID: 207 distinct values
Country of Origin: 22 distinct values
Farm Name: 173 distinct values
Lot Number: 188 distinct values
Mill: 163 distinct values
ICO Number: 68 distinct values
Company: 72 distinct values
Altitude: 98 distinct values
Region: 121 distinct values
Producer: 173 distinct values
Number of Bags: 55 distinct values
Bag Weight: 39 distinct values
In-Country Partner: 21 distinct values
Harvest Year: 7 distinct values
Grading Date: 75 distinct values
Owner: 80 distinct values
Variety: 49 distinct values
Status: 1 distinct values
Processing Method: 11 distinct values
Aroma: 19 distinct values
Flavor: 19 distinct values
Aftertaste: 20 distinct values
Acidity: 19 distinct values
Body: 17 distinct values
Balance: 18 distinct values
Uniformity: 3 distinct values
Clean Cup: 1 distinct values
Sweetness: 1 distinct values
Overall: 21 distinct values
Defects: 1 distinct values
Total Cup Points: 81 distinct values
Moisture Percentage: 46 distinct values
Category One Defects: 6 distinct values
Quakers: 11 distinct values
Color: 12 distinct values
Category Two Defects: 14 distinct values
Expiration: 75 distinct values
Certification Body: 21 distinct values
Certification Address: 21 distinct values
Certification Contact: 21 distinct values
The code calculates the missing ratio for each column of df as the percentage of missing values. It drops columns with no missing values, sorts the rest in descending order of missing ratio, and stores them in a DataFrame called missing_data, of which the first 20 rows are displayed.
#check missing ratio
data_na = (df.isnull().sum() / len(df)) * 100
data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :data_na})
missing_data.head(20)
| | Missing Ratio |
|---|---|
ICO Number | 63.768116 |
Variety | 2.898551 |
Processing Method | 2.415459 |
Mill | 1.449275 |
Farm Name | 0.966184 |
Region | 0.966184 |
Lot Number | 0.483092 |
Altitude | 0.483092 |
Producer | 0.483092 |
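Since seaborn is already imported, the missing ratios can also be visualized at a glance (an optional sketch):
# Optional: bar chart of the missing ratio per column
plt.figure(figsize=(8, 4))
sns.barplot(x=missing_data.index, y=missing_data['Missing Ratio'], color='gray')
plt.title('Missing Ratio by Column (%)')
plt.xticks(rotation=45)
plt.show()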
The code builds a mapping dictionary called processing_mapping that maps non-standard values in the 'Processing Method' column to their canonical categories. It applies the mapping with replace(), which leaves values that are already canonical untouched, and then fills any remaining missing values with "Washed / Wet" using fillna().
# Map non-standard processing-method labels to their canonical categories
processing_mapping = {
    "Double Anaerobic Washed": "Washed / Wet",
    "Semi Washed": "Washed / Wet",
    "Honey,Mossto": "Pulped natural / honey",
    "Double Carbonic Maceration / Natural": "Natural / Dry",
    "Wet Hulling": "Washed / Wet",
    "Anaerobico 1000h": "Washed / Wet",
    "SEMI-LAVADO": "Natural / Dry"
}
# Replace the non-standard values, keeping categories that are already canonical
df['Processing Method'] = df['Processing Method'].replace(processing_mapping)
# Fill genuine missing values with "Washed / Wet"
df['Processing Method'] = df['Processing Method'].fillna("Washed / Wet")
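As a quick sanity check (a minimal sketch; the resulting counts are not reproduced here), the remaining categories can be inspected:
# Inspect the processing-method categories that remain after the cleanup
print(df['Processing Method'].value_counts(dropna=False))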
The first part of the code manually imputes specific values for the 'Altitude' column based on the corresponding 'ID' values.
The second part defines a function called clean_altitude_range that cleans each value in the 'Altitude' column: it removes blank spaces, converts plain values to integers, and replaces range strings (e.g., "1000-2000") with the midpoint of the range. The function is then applied to the 'Altitude' column with apply().
# Manually impute specific altitude values by ID (entries that the cleaning function below cannot handle)
df.loc[df['ID'] == 99, 'Altitude'] = 5273 # Impute value for ID 99
df.loc[df['ID'] == 105, 'Altitude'] = 1800 # Impute value for ID 105
df.loc[df['ID'] == 180, 'Altitude'] = 1400 # Impute value for ID 180
# Define a function to clean and calculate the mean
def clean_altitude_range(range_value):
    if isinstance(range_value, str):
        range_value = range_value.replace(" ", "")  # Remove blank spaces
        if '-' in range_value:
            try:
                start, end = range_value.split('-')
                start = int(start)
                end = int(end)
                return (start + end) / 2
            except ValueError:
                return np.nan
        else:
            try:
                return int(range_value)
            except ValueError:
                return np.nan
    else:
        return range_value
# Apply the function to clean and calculate the mean for each value in the "Altitude" column
df['Altitude'] = df['Altitude'].apply(clean_altitude_range)
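A few representative inputs illustrate what the helper returns (the example values below are for illustration only):
# Range strings become the midpoint, plain numbers are cast to int, unparseable strings become NaN
print(clean_altitude_range("1700-1930"))   # -> 1815.0
print(clean_altitude_range(" 1200 "))      # -> 1200
print(clean_altitude_range("unknown"))     # -> nan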
The code extracts the earlier year from the "Harvest Year" column: for entries that span two years (e.g., a value like "2021/2022"), it splits on '/' and keeps the first part, then strips any leading or trailing spaces.
# Extract the prior year from the "Harvest Year" column
df['Harvest Year'] = df['Harvest Year'].str.split('/').str[0].str.strip()
The code converts the "Harvest Year" column to datetime objects using the specified date format '%Y'. It also converts the "Expiration" column to datetime objects using the dateutil parser.
# Convert "Harvest Year" and "Expiration" columns to datetime objects using dateutil parser
df['Harvest Year'] = pd.to_datetime(df['Harvest Year'], format='%Y')
df['Expiration'] = df['Expiration'].apply(parser.parse)
The code calculates the difference in days between the "Expiration" and "Harvest Year" columns and stores the result in a new column called "Coffee Age". For the first row, for example, an expiration of September 21st, 2023 against a harvest year parsed as January 1st, 2021 gives 993 days.
# Calculate the difference in days between "Expiration" and "Harvest Year" columns
df['Coffee Age'] = (df['Expiration'] - df['Harvest Year']).dt.days
The code drops multiple columns specified in the list columns_to_drop from the DataFrame df using the drop function with the axis=1 parameter to indicate column-wise removal. The changes are made in place by setting inplace=True.
columns_to_drop = ['Unnamed: 0', 'ID', 'ICO Number', 'Owner', 'Region', 'Certification Contact', 'Certification Address',
                   'Farm Name', 'Lot Number', 'Mill', 'Producer', 'Company', 'Expiration', 'Harvest Year',
                   'Number of Bags', 'Bag Weight', 'In-Country Partner', 'Grading Date', 'Variety', 'Status',
                   'Defects', 'Uniformity', 'Clean Cup', 'Sweetness', 'Certification Body']
df.drop(columns_to_drop, axis=1, inplace=True)
The code creates a subplot grid with each subplot representing a numeric attribute from the numeric_attributes list. A histogram is added to each subplot using the data from the corresponding attribute in the DataFrame df. The resulting figure is displayed with a specified height, width, and title.
# List of numeric attributes
numeric_attributes = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Overall', 'Total Cup Points', 'Moisture Percentage','Coffee Age']
# Create a subplot for each numeric attribute
fig = make_subplots(rows=len(numeric_attributes), cols=1)
# Add a histogram to the subplot for each numeric attribute
for i, attribute in enumerate(numeric_attributes):
    fig.add_trace(go.Histogram(x=df[attribute], nbinsx=50, name=attribute), row=i+1, col=1)
fig.update_layout(height=200*len(numeric_attributes), width=800, title_text="Histograms of Numeric Attributes")
fig.show()
The code first groups the data by country and calculates the mean of the "Total Cup Points" for each country. It then creates a Choropleth map from the grouped data to visualize the average total cup points by country. Additionally, a bar plot is created with the seaborn library to show the same information, where each bar represents a country's average total cup points.
# Group the data by country and calculate the mean of Total Cup Points
df_grouped = df.groupby('Country of Origin')['Total Cup Points'].mean().reset_index()
# Create a Choropleth map
fig = px.choropleth(df_grouped,
                    locations='Country of Origin',
                    locationmode='country names',
                    color='Total Cup Points',
                    hover_name='Country of Origin',
                    color_continuous_scale=px.colors.sequential.Plasma,
                    title='Average Total Cup Points by Country')
fig.show()
# Create a bar plot with gray color
plt.figure(figsize=(10, 5))
sns.barplot(x=df_grouped['Country of Origin'], y=df_grouped['Total Cup Points'], color='gray')
plt.title('Average Total Cup Points by Country')
plt.xlabel('Country of Origin')
plt.ylabel('Average Total Cup Points')
plt.xticks(rotation=90)
plt.show()
The code groups the data by country and calculates the mean of the "Coffee Age" (shelter life) for each country. It then creates a Choropleth map using the grouped data to visualize the average coffee shelter life by country. Additionally, a bar plot is created using seaborn library to show the same information, where each bar represents a country's average coffee shelter life in days.
# Group the data by country and calculate the mean of Coffee Age
df_grouped = df.groupby('Country of Origin')['Coffee Age'].mean().reset_index()
# Create a Choropleth map
fig = px.choropleth(df_grouped,
                    locations='Country of Origin',
                    locationmode='country names',
                    color='Coffee Age',
                    hover_name='Country of Origin',
                    color_continuous_scale=px.colors.sequential.Plasma,
                    title='Average Coffee Shelf Life by Country (Days)')
fig.show()
# Create a bar plot with gray color
plt.figure(figsize=(10, 5))
sns.barplot(x=df_grouped['Country of Origin'], y=df_grouped['Coffee Age'], color='gray')
plt.title('Average Coffee Shelf Life by Country (Days)')
plt.xlabel('Country of Origin')
plt.ylabel('Average Coffee Shelf Life (Days)')
plt.xticks(rotation=90)
plt.show()
The code groups the data by country and calculates the mean of the "Altitude" for each country. It then creates a Choropleth map using the grouped data to visualize the average altitude by country. Additionally, a bar plot is created using seaborn library to show the same information, where each bar represents a country's average altitude.
# Group the data by country and calculate the mean of Altitude
df_grouped = df.groupby('Country of Origin')['Altitude'].mean().reset_index()
# Create a Choropleth map
fig = px.choropleth(df_grouped,
                    locations='Country of Origin',
                    locationmode='country names',
                    color='Altitude',
                    hover_name='Country of Origin',
                    color_continuous_scale=px.colors.sequential.Plasma,
                    title='Average Altitude by Country')
fig.show()
# Create a bar plot with gray color
plt.figure(figsize=(10, 5))
sns.barplot(x=df_grouped['Country of Origin'], y=df_grouped['Altitude'], color='gray')
plt.title('Average Altitude by Country')
plt.xlabel('Country of Origin')
plt.ylabel('Average Altitude')
plt.xticks(rotation=90)
plt.show()
The code counts how many records come from each country and creates a choropleth map to visualize the sample count per country. It also creates a bar plot of the same information, where each bar represents a country and its record count.
# Count the unique occurrences of each country
df_count = df['Country of Origin'].value_counts().reset_index()
df_count.columns = ['Country of Origin', 'Count']
# Create a choropleth map
fig = px.choropleth(df_count,
                    locations='Country of Origin',
                    locationmode='country names',
                    color='Count',
                    hover_name='Country of Origin',
                    color_continuous_scale=px.colors.sequential.Plasma,
                    title='Number of Samples per Country')
fig.show()
# Create a bar plot
plt.figure(figsize=(10, 5))
sns.barplot(x=df_count['Country of Origin'], y=df_count['Count'], color='gray')
plt.title('Number of Samples per Country')
plt.xlabel('Country of Origin')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()
The code creates a copy of the DataFrame df and assigns it to the variable data, then defines the categorical and numerical column lists and drops the 'Country of Origin' and 'Color' columns from data. Finally, it one-hot encodes the categorical columns listed in categorical_columns with pd.get_dummies, storing the result in the DataFrame dummy_variables.
data = df.copy()
categorical_columns = ['Processing Method']
numerical_columns = ['Altitude', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Overall', 'Total Cup Points', 'Moisture Percentage', 'Category One Defects', 'Quakers', 'Category Two Defects', 'Coffee Age']
columns_to_drop = ['Country of Origin', 'Color']
data.drop(columns_to_drop, axis=1, inplace=True)
dummy_variables = pd.get_dummies(data, columns=categorical_columns, drop_first=False)
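To confirm what get_dummies produced, the new indicator columns can be listed (a small check; pandas names them '&lt;column&gt;_&lt;category&gt;'):
# List the one-hot columns created for 'Processing Method'
print(dummy_variables.filter(like='Processing Method_').columns.tolist())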
The code creates a StandardScaler object named scaler. It then scales the numerical columns specified in numerical_columns of the data DataFrame using the fit_transform method of the scaler. The scaled values are stored in the scaled_numerical array. Finally, the scaled values are converted into a DataFrame named scaled_numerical_df with column names specified by numerical_columns.
scaler = StandardScaler()
# Scale the numerical columns
scaled_numerical = scaler.fit_transform(data[numerical_columns])
# Convert the scaled numerical columns
scaled_numerical_df = pd.DataFrame(scaled_numerical, columns=numerical_columns)
The code drops the original numerical columns from the dummy_variables DataFrame using the drop method. Then, it concatenates the remaining dummy variables and the scaled numerical columns (scaled_numerical_df) along the columns axis using the pd.concat function. The resulting DataFrame is stored in processed_df.
# Drop the original numerical columns
dummy_variables = dummy_variables.drop(numerical_columns, axis=1)
# Concatenate the dummy variables and scaled numerical columns
processed_df = pd.concat([dummy_variables, scaled_numerical_df], axis=1)
In Graph I, the code generates a correlation matrix for the processed_df DataFrame using the corr() method. It then creates a heatmap using seaborn's heatmap function to visualize the correlation matrix.
In Graph II, the code calculates the correlation between each feature in processed_df and the target column 'Total Cup Points' (target_corr). It sorts the correlation values in descending order and creates a heatmap using seaborn's heatmap function to visualize the correlations with the target column.
correlation_matrix = processed_df.corr()
#Graph I.
plt.figure(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt='.2f')
plt.title("Correlation Matrix Heatmap")
plt.show()
corr = processed_df.corr()
target_corr = corr['Total Cup Points'].drop('Total Cup Points')
# Sort correlation values in descending order
target_corr_sorted = target_corr.sort_values(ascending=False)
#Graph II
# Create a heatmap of the correlations with the target column
sns.set(font_scale=0.8)
sns.set_style("white")
sns.set_palette("PuBuGn_d")
sns.heatmap(target_corr_sorted.to_frame(), cmap="coolwarm", annot=True, fmt='.2f')
plt.title('Correlation with Total Cup Points')
plt.show()
The code performs several preprocessing steps on the df DataFrame. It creates a copy of the DataFrame as df_preprocessed. It applies label encoding to categorical variables using LabelEncoder, and min-max scaling to numerical/ratio variables using MinMaxScaler. Then, it applies t-SNE (t-Distributed Stochastic Neighbor Embedding) to reduce the dimensionality of the preprocessed data to 2D. Finally, it creates an interactive scatter plot using Plotly to visualize the t-SNE results, with the points colored by the 'Total Cup Points' feature.
# Create a copy of the dataframe to not alter the original
df_preprocessed = df.copy()
# Preprocessing: Label encoding for categorical variables
le = LabelEncoder()
categorical_features = ['Country of Origin', 'Processing Method', 'Color']
for feature in categorical_features:
    df_preprocessed[feature] = le.fit_transform(df[feature])
# Preprocessing: MinMax scaling for numerical/ratio variables
mm = MinMaxScaler()
numerical_features = ['Altitude', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Overall', 'Total Cup Points', 'Moisture Percentage', 'Category One Defects', 'Quakers', 'Category Two Defects', 'Coffee Age']
for feature in numerical_features:
    df_preprocessed[feature] = mm.fit_transform(df[feature].values.reshape(-1, 1))
# Apply t-SNE with different perplexity and learning rate
tsne = TSNE(n_components=2, random_state=42, perplexity=50, learning_rate=200)
tsne_results = tsne.fit_transform(df_preprocessed)
# Plotly Interactive plot
df_tsne = pd.DataFrame(data = tsne_results, columns = ['Dim_1', 'Dim_2'])
df_tsne['Total Cup Points'] = df['Total Cup Points']
fig = px.scatter(df_tsne, x='Dim_1', y='Dim_2', color='Total Cup Points', title='t-SNE plot colored by Total Cup Points')
fig.show()
We can see the following from the plot:
- Clusters: the points form distinct clusters, implying that the dataset contains groups of coffee samples with similar properties.
- Color gradient: the 'Total Cup Points' gradient follows a clear pattern across the clusters, showing that coffee quality differs from cluster to cluster.
- Outliers: the figure shows no obvious outliers, implying that the majority of the coffee samples have comparable features.
df.head()
| | Country of Origin | Altitude | Processing Method | Aroma | Flavor | Aftertaste | Acidity | Body | Balance | Overall | Total Cup Points | Moisture Percentage | Category One Defects | Quakers | Color | Category Two Defects | Coffee Age |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Colombia | 1815.0 | Washed / Wet | 8.58 | 8.50 | 8.42 | 8.58 | 8.25 | 8.42 | 8.58 | 89.33 | 11.8 | 0 | 0 | green | 3 | 993 |
1 | Taiwan | 1200.0 | Washed / Wet | 8.50 | 8.50 | 7.92 | 8.00 | 7.92 | 8.25 | 8.50 | 87.58 | 10.5 | 0 | 0 | blue-green | 0 | 1048 |
2 | Laos | 1300.0 | Washed / Wet | 8.33 | 8.42 | 8.08 | 8.17 | 7.92 | 8.17 | 8.33 | 87.42 | 10.4 | 0 | 0 | yellowish | 2 | 1048 |
3 | Costa Rica | 1900.0 | Washed / Wet | 8.08 | 8.17 | 8.17 | 8.25 | 8.17 | 8.08 | 8.25 | 87.17 | 11.8 | 0 | 0 | green | 0 | 628 |
4 | Colombia | 1975.0 | Pulped natural / honey | 8.33 | 8.33 | 8.08 | 8.25 | 7.92 | 7.92 | 8.25 | 87.08 | 11.6 | 0 | 2 | yellow-green | 2 | 794 |
The code defines a preprocessor using ColumnTransformer that scales the numerical columns with MinMaxScaler and one-hot encodes the categorical columns with OneHotEncoder. It then builds a pipeline (clf) that combines the preprocessor with a RandomForestRegressor as the estimator. The data is split into training and test sets (X_train, X_test, y_train, y_test) using train_test_split.
categorical_columns = ['Processing Method','Country of Origin', 'Color']
numerical_columns = ['Altitude', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Overall', 'Total Cup Points', 'Moisture Percentage', 'Category One Defects', 'Quakers', 'Category Two Defects', 'Coffee Age']
# Define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), [col for col in numerical_columns if col != 'Total Cup Points']),
        ('cat', OneHotEncoder(), categorical_columns)])
# Append the regressor to the preprocessing pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])
# Split the data into train and test sets
X = df.drop('Total Cup Points', axis=1)
y = df['Total Cup Points']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
The code trains the clf model using the training data (X_train, y_train), then predicts the target variable (Total Cup Points) for the test data (X_test) and stores the predictions in y_pred. Finally, it calculates the mean squared error (mse) between the actual target values (y_test) and the predicted values, and prints it.
# Train the model
clf.fit(X_train, y_train)
# Predict on the test set
y_pred = clf.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)
Mean Squared Error: 0.12949020428571217
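MSE alone is hard to read on the cup-point scale, so it can help to also report RMSE and R² (a small addition; r2_score needs an extra import from sklearn.metrics):
from sklearn.metrics import r2_score
# RMSE is in the same units as Total Cup Points; R² summarizes explained variance
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print('RMSE:', rmse)
print('R^2:', r2)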
The code creates a scatter plot of the residuals, which are the differences between the actual target values (y_test) and the predicted values (y_pred). The plot helps visualize the distribution and patterns of the residuals.
# Plot residuals
plt.figure(figsize=(10, 6))
residuals = y_test - y_pred
sns.scatterplot(x=y_test, y=residuals)
plt.title('Residuals Plot')
plt.xlabel('Actual Values')
plt.ylabel('Residuals')
plt.show()
This type of visualization shows the difference between the actual and predicted values of a regression model.
Let's break the plot down:
In an ideal model, all points would lie on the horizontal line at y=0, which represents cases where the predicted value exactly matches the actual value.
Looking at the plot, the residuals are scattered around the y=0 line, which is a good sign: the model's errors appear to be randomly distributed with no obvious pattern.
Nonetheless, some points stray from the y=0 line, marking cases where the model's predictions were off.
Keep in mind that the distance between a point and the y=0 line measures prediction quality: points far from the line correspond to predictions that deviate substantially from the actual values.
Finally, the density of points carries information about the error distribution. A high density of points far from the y=0 line would indicate systematic errors in the model.
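To complement the scatter plot, the distribution of the residuals can also be checked directly (a quick sketch; the bin count is arbitrary):
# Histogram of residuals: a roughly symmetric shape around zero suggests unbiased errors
plt.figure(figsize=(8, 4))
sns.histplot(residuals, bins=30, color='gray')
plt.title('Distribution of Residuals')
plt.xlabel('Residual (Actual - Predicted)')
plt.show()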
The code creates a scatter plot of the predicted values (y_pred) versus the actual values (y_test). It visualizes the prediction errors and shows how well the predicted values align with the actual values.
# Plot prediction error
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.title('Prediction Error Plot')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.show()
This type of visualization, usually called a prediction error plot, lets us compare the actual and predicted values of a regression model.
Let's break the plot down:
With perfect predictions, all points would lie on the diagonal line, which represents cases where the predicted value exactly matches the actual value.
Looking at the plot, the majority of points sit close to the diagonal, indicating that the model makes reasonably accurate predictions. Some points stray from the line, however, revealing cases where the model's predictions were off.
Keep in mind that the proximity of the points to the diagonal reflects prediction accuracy: points far from the line correspond to predictions that deviate substantially from the actual values.
The density of points again carries information about the error distribution; a high density of points away from the diagonal would suggest systematic errors.
To summarize, this plot gives a visual way to evaluate the performance of a regression model: it helps show where the model makes errors and how large those errors are.
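A single train/test split can be optimistic or pessimistic by chance, so cross-validation (cross_val_score was already imported above) gives a more stable estimate. A minimal sketch, with the fold count chosen arbitrarily; note that if a fold contains a country that never appears in the corresponding training folds, the OneHotEncoder may need handle_unknown='ignore':
# 5-fold cross-validated MSE for the full preprocessing + random-forest pipeline
cv_scores = cross_val_score(clf, X, y, cv=5, scoring='neg_mean_squared_error')
print('Cross-validated MSE:', -cv_scores.mean())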